## load required libraries
library(tidyverse)
library(quanteda)
library(lexicon)
library(reshape2)
library(stringi)
library(quanteda.textplots)
library(quanteda.textmodels)
library(quanteda.textstats)
library(gridExtra)
library(seededlda)
library(ggrepel)
library(ggdendro)
library(factoextra)
library(lattice)
library(spacyr)

## clean workspace
rm(list = ls())

## set working directory (WD)
path <- '~/coliphi21/practice_lessons/lesson_2/src/'
## you can also set it dynamically:
## setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
setwd(path)

## check that WD is set correctly
getwd()
## [1] "/Users/lucienbaumgartner/coliphi21/practice_lessons/lesson_2/src"
## For this tutorial you can either work with your own data, or the pre-built
## corpora provided in the /input folder for the first practice session. The
## quanteda package also contains pre-built corpora you can use. For this
## session, we scraped the Stanford Encyclopedia of Philosophy and built a
## corpus including additional metadata.
## Loading either file below imports a pre-built corpus object called `sfe`.
## relative path
load('../input/stanford-encyclopedia.RDS')
## absolute path
load('~/coliphi21/practice_lessons/lesson_2/input/stanford-encyclopedia.RDS')

## how does the corpus object look like?
sfe
## Corpus consisting of 1,712 documents and 21 docvars.
## 18thGerman-preKant.json :
## " In Germany, the eighteenth century was the age of enlighten..."
##
## abduction.json :
## " In the philosophical literature, the term abduction is used..."
##
## abelard.json :
## " Peter Abelard (1079–21 April 1142) [Abailard or Abaelard or..."
##
## abhidharma.json :
## " The first centuries after Śākyamuni Buddha death saw the ri..."
##
## abilities.json :
## " In the accounts we give of one another, claims about our ab..."
##
## abner-burgos.json :
## " Abner of Burgos (Alfonso de Valladolid; c. 1260–1347) was p..."
##
## [ reached max_ndoc ... 1,706 more documents ]
## summary statistics
summary(sfe) %>% head()

## available variables
docvars(sfe)

## Familiarize yourself a little more with the data.
## tokenization: split into word tokens, dropping punctuation, symbols,
## numbers and URLs
toks <- tokens(sfe, what = 'word',
               remove_punct = TRUE, remove_symbols = TRUE, padding = FALSE,
               remove_numbers = TRUE, remove_url = TRUE)
## to lower
toks <- tokens_tolower(toks)
## lemmatizing via the lexicon package's token -> lemma hash table
toks <- tokens_replace(toks,
                       pattern = lexicon::hash_lemmas$token,
                       replacement = lexicon::hash_lemmas$lemma)
## remove stopwords
toks <- tokens_select(toks, pattern = stopwords("en"), selection = "remove")
## remove noise: any single character and any token containing a digit
## (fixed: the original class [A-z] also matches the ASCII characters
## [ \ ] ^ _ ` lying between Z and a; [A-Za-z] is the intended range --
## behavior is unchanged, since the ^.$ alternative already removes every
## single-character token anyway)
toks <- tokens_select(toks, pattern = '^[A-Za-z]$|[0-9]+|^.$', valuetype = 'regex', selection = 'remove')
## create dfm: keep terms above the 80% total-frequency quantile that
## appear in at most 10% of documents
dfm_sfe <- dfm(toks) %>%
  dfm_trim(min_termfreq = 0.8, termfreq_type = "quantile",
           max_docfreq = 0.1, docfreq_type = "prop")
dfm_sfe
## Document-feature matrix of: 1,712 documents, 24,689 features (98.47% sparse) and 21 docvars.
## features
## docs ethos immanuel thomasius pietist thomasians wolff well dis halle pietism
## 18thGerman-preKant.json 2 1 33 6 11 36 0 1 19 7
## abduction.json 0 0 0 0 0 0 0 0 0 0
## abelard.json 0 0 0 0 0 0 0 0 0 0
## abhidharma.json 0 0 0 0 0 0 0 0 0 0
## abilities.json 0 0 0 0 0 0 0 0 0 0
## abner-burgos.json 0 0 0 0 0 0 0 0 0 0
## [ reached max_ndoc ... 1,706 more documents, reached max_nfeat ... 24,679 more features ]
topfeatures(dfm_sfe, n=200)## gene disability spinoza turing avicenna buddhist heidegger ibn chinese spacetime nietzsche algebra gödel hilbert einstein husserl dewey computation reid molecular supervenience luck peirce intuitionistic ockham du cardinal marx simulation privacy brentano utilitarianism algorithm ordinal kuhn neural bacon oppression clarke jones monism capability conscience tarski racial popper weyl fictional african trope arabic theism
## 2017 1908 1765 1612 1553 1522 1512 1509 1450 1320 1283 1263 1262 1225 1222 1178 1174 1148 1127 1122 1113 1090 1084 1070 1056 1038 1024 1001 976 975 966 966 964 930 916 913 910 906 900 900 889 863 860 857 855 855 855 853 847 845 842 841
## coercion inheritance ramsey averroes dna ai gravity strawson malebranche domination reichenbach user distributive artist artifact darwin relativism scotus maimonides whitehead pythagoras feminism climate ca consequentialist dao goodman physicalism algebraic fitness grace anderson wright diagram hole arrow berlin bce searle curve folk bayesian molecule bolzano noun fodor boethius newtonian pythagorean wolff confucian ball
## 839 836 833 830 827 822 816 812 808 804 796 795 784 780 778 775 775 773 772 772 767 764 763 757 757 755 754 745 742 740 738 735 731 731 730 730 727 726 726 725 725 720 719 715 713 712 710 708 705 700 696 694
## demonstrative pornography matrix parmenides profile evidential buddhism egalitarian artistic ritual principia recursive cicero albert bradley cosmological austin sovereign adaptation bell japanese conservation nagel chalmers sidgwick torture deterministic plotinus architecture cancer sage adams liberalism miracle entailment triangle imagery brouwer dworkin herder christ ce stream scripture desert romantic thick self-consciousness electron plural collins sartre
## 694 693 691 687 683 673 673 670 664 658 657 655 652 651 649 648 644 644 643 642 641 641 638 637 637 636 631 628 627 626 625 622 621 618 616 614 614 611 610 608 607 607 607 605 604 603 601 596 595 595 595 595
## berkeley naturalist affective atomism nozick twin tense statue armstrong update proclus entropy temperature income buddha bohr payoff chisholm quale instant weber salmon metric pluralist coercive celestial poem introspection ross bentham graph liberation platonism sovereignty bois node constitutional dynamical mystical slavery gas digital vector supervene
## 593 593 592 592 591 590 589 586 583 579 579 576 575 575 574 574 573 573 573 570 570 570 569 566 564 563 562 562 562 561 561 559 558 557 557 557 556 555 554 553 553 552 552 549
## Check whether there is still some noise in the data and remove it.
## Hint: scan through the topfeatures.
## remove phi
toks <- tokens_select(toks, pattern = 'φ', valuetype = 'regex', selection = 'remove')

## rebuild the dfm with the same trimming thresholds as before
dfm_sfe <- dfm_trim(dfm(toks),
                    min_termfreq = 0.8, termfreq_type = "quantile",
                    max_docfreq = 0.1, docfreq_type = "prop")

## compute the correspondence-analysis model
sfe_ca <- textmodel_ca(dfm_sfe)

## coerce model coefficients to a dataframe (reusing the name sfe_ca)
sfe_ca <- data.frame(dim1 = coef(sfe_ca, doc_dim = 1)$coef_document,
                     dim2 = coef(sfe_ca, doc_dim = 2)$coef_document)
## document id = document name without '.json' (and optional numeric suffix)
sfe_ca$id <- gsub('\\.json(\\.[0-9])?', '', rownames(sfe_ca))
sfe_ca
## plot full data with branch annotation
ggplot(sfe_ca, aes(x = dim1, y = dim2, label = id)) +
  geom_point(aes(color = dim1 - dim2), alpha = 0.2) +
  # label a random 20% of the points, using a repel function
  geom_text_repel(data = dplyr::sample_frac(sfe_ca, 0.2), max.overlaps = 15, seed = 6734) +
  labs(title = 'Correspondence Analysis: Full Data') +
  theme_bw() +
  theme(plot.title = element_text(face = 'bold'))

## plot parts of the data (zoom into the dense region)
ggplot(sfe_ca, aes(x = dim1, y = dim2, label = id)) +
  geom_point(aes(color = dim1 - dim2), alpha = 0.2) +
  # label a random 20% of the points, using a repel function
  geom_text_repel(data = dplyr::sample_frac(sfe_ca, 0.2), max.overlaps = 9, seed = 6734) +
  scale_y_continuous(limits = c(-2, 0)) +
  scale_x_continuous(limits = c(-1, 1)) +
  labs(title = 'Correspondence Analysis: Zoom') +
  theme_bw() +
  theme(plot.title = element_text(face = 'bold'))
## run naive unsupervised topic model with 10 topics
## fix the RNG state so the LDA result is reproducible
set.seed(123)
sfe_lda <- textmodel_lda(dfm_sfe, k = 10)
## print top 20 terms per topic
terms(sfe_lda, 20)
## topic1 topic2 topic3 topic4 topic5 topic6 topic7 topic8 topic9 topic10
## [1,] "supervenience" "gene" "disability" "turing" "spacetime" "ockham" "chinese" "heidegger" "privacy" "ibn"
## [2,] "trope" "molecular" "oppression" "gödel" "einstein" "bacon" "spinoza" "dewey" "theism" "avicenna"
## [3,] "monism" "neural" "african" "algebra" "kuhn" "scotus" "reid" "husserl" "torture" "buddhist"
## [4,] "fictional" "simulation" "racial" "intuitionistic" "popper" "pythagoras" "dao" "du" "user" "maimonides"
## [5,] "bolzano" "dna" "coercion" "computation" "reichenbach" "boethius" "nietzsche" "malebranche" "clinical" "arabic"
## [6,] "brentano" "darwin" "feminism" "ordinal" "weyl" "pythagorean" "confucian" "artist" "whitehead" "averroes"
## [7,] "goodman" "fitness" "capability" "hilbert" "gravity" "parmenides" "sidgwick" "berlin" "delusion" "japanese"
## [8,] "physicalism" "inheritance" "domination" "cardinal" "hole" "proclus" "utilitarianism" "artistic" "theist" "buddha"
## [9,] "strawson" "fodor" "pornography" "tarski" "ramsey" "cicero" "mohists" "spinoza" "theistic" "buddhism"
## [10,] "noun" "ai" "distributive" "peirce" "bayesian" "philo" "conscience" "wolff" "hartshorne" "dharma"
## [11,] "bradley" "artifact" "marx" "algorithm" "entropy" "plotinus" "consequentialist" "nietzsche" "enhancement" "islamic"
## [12,] "austin" "imagery" "egalitarian" "algebraic" "bohr" "sextus" "luck" "herder" "doxastic" "indian"
## [13,] "armstrong" "biologist" "dworkin" "recursive" "newtonian" "porphyry" "laozi" "clarke" "omnipotent" "emptiness"
## [14,] "chisholm" "drift" "coercive" "brouwer" "payoff" "abelard" "zhuangzi" "sartre" "embryo" "mystical"
## [15,] "entailment" "biodiversity" "sovereign" "provable" "feyerabend" "bce" "zhu" "romantic" "suicide" "nishida"
## [16,] "plural" "cancer" "income" "zfc" "dynamical" "iamblichus" "thick" "bois" "internalism" "japan"
## [17,] "intension" "quale" "constitutional" "computable" "bell" "fr" "hutcheson" "artwork" "gratitude" "zen"
## [18,] "implicature" "adaptation" "liberalism" "cantor" "gas" "luther" "confucius" "fichte" "engine" "hebrew"
## [19,] "grice" "genome" "anderson" "diagram" "mach" "timaeus" "relativism" "schopenhauer" "csm" "vasubandhu"
## [20,] "meinong" "digital" "republican" "definable" "bet" "sophist" "wang" "collins" "goldman" "al-fārābī"
## plot the topics over the correspondence analysis data
sfe_ca$topics <- topics(sfe_lda)
ggplot(sfe_ca, aes(x=dim1, y=dim2, color=topics)) +
  geom_point(alpha = 0.5, shape = '.') +
  geom_density_2d(alpha = 0.5) +
  theme_bw() +
  theme(plot.title = element_text(face='bold')) +
  labs(title = 'Correspondence Analysis with Topic Annotation (k=10)')

## Change the names of the topics (to some meaningful description)
## before plotting.
sfe_ca$topics <- recode(sfe_ca$topics, topic1 = "body-mind", topic2 = "biology",
                        topic3 = "feminism/critical thinking", topic4 = "math/ai",
                        topic5 = "physics", topic6 = "classics", topic7 = "eastern",
                        topic8 = "phenomenology", topic9 = "religion",
                        topic10 = "middle-eastern/eastern")
## set seed
set.seed(48621)
## draw a random sample of 5 documents
## (fixed comment: the original said 20, but only 5 are drawn)
sfe_sub <- sfe[sample(seq_len(length(sfe)), 5)]
sfe_sub
## Corpus consisting of 5 documents and 21 docvars.
## albert-saxony.json :
## " Albert of Saxony (ca. 1320–1390), Master of Arts at Paris, ..."
##
## contractarianism.json.1 :
## " Contractarianism names both a political theory of the legit..."
##
## preferences.json.1 :
## " The notion of preference has a central role in many discipl..."
##
## plotinus.json :
## " Plotinus (204/5 – 270 C.E.), is generally regarded as the f..."
##
## paternalism.json :
## " Paternalism is the interference of a state or an individual..."
## PoS-tagging: parse the sample with spaCy (POS, fine-grained tag, lemma,
## named entities, and dependency relations)
sfe_pos <- spacy_parse(sfe_sub, pos = TRUE, tag = TRUE, lemma = TRUE,
                       entity = TRUE, dependency = TRUE)
sfe_pos

## look up which adjectives are used most frequently
sfe_pos %>%
  filter(pos == 'ADJ') %>%
  group_by(token) %>%
  summarise(n.occurences = n()) %>%
  arrange(desc(n.occurences))

## look up which nouns are preceded by the adjective "rational"
rational_noun <- sfe_pos %>% filter(pos == 'NOUN' & lag(token, 1) == 'rational')
rational_noun

## get top 2 nouns per document
rational_noun %>%
  group_by(doc_id, token) %>%
  summarise(n.occurences = n()) %>%
  arrange(doc_id, desc(n.occurences)) %>%
  slice(1:2)
## to create a corpus-object from your pos-tagged tokens
## we need unique IDs
sfe_pos
## make doc_ids unique, then drop punctuation and space tokens
sfe_pos <- sfe_pos %>%
  mutate(doc_id = make.unique(doc_id)) %>%
  filter(!pos %in% c('PUNCT', 'SPACE'))
## build a token-level corpus (one "document" per token)
sfe_pos <- corpus(sfe_pos, text_field = 'token', docid_field = 'doc_id')
sfe_pos
## Corpus consisting of 35,214 documents and 8 docvars.
## albert-saxony.json.1 :
## "Albert"
##
## albert-saxony.json.2 :
## "of"
##
## albert-saxony.json.3 :
## "Saxony"
##
## albert-saxony.json.5 :
## "ca"
##
## albert-saxony.json.6 :
## "."
##
## albert-saxony.json.7 :
## "1320–1390"
##
## [ reached max_ndoc ... 35,208 more documents ]
docvars(sfe_pos)
## WARNING! This data-structure is incompatible with our document-based corpus!!!
docvars(sfe_sub)
## ... but we can add the info to our token corpus
# add initial document ID to both sets of docvars
docvars(sfe_pos)$initial_docid <- gsub('\\.json.*', '', docid(sfe_pos))
docvars(sfe_sub)$initial_docid <- gsub('\\.json.*', '', docid(sfe_sub))
# join by initial id
docvars(sfe_pos) <- left_join(docvars(sfe_pos), docvars(sfe_sub), by = 'initial_docid')
docvars(sfe_pos)
## Keep in mind: your corpus is still on token level!
## hierarchical clustering - get distances on normalized dfm
sfe_dist_mat <- dfm_weight(dfm_sfe, scheme = "prop") %>%
  textstat_dist(method = "euclidean") %>%
  as.dist()
## hierarchical clustering of the distance object (Ward's method)
sfe_cluster <- hclust(sfe_dist_mat, method = 'ward.D')
# label with document names (strip '.json' and optional numeric suffix)
sfe_cluster$labels <- gsub('\\.json(\\.[0-9])?', '', docnames(dfm_sfe))
## determine best number of clusters
# fviz_nbclust(as.matrix(sfe_dist_mat), FUN = hcut, method = "wss")
## cut tree into four groups
clusters <- cutree(sfe_cluster, k = 4)
## add cluster data to the correspondence analysis
## (explicit `by` makes the join key visible and silences dplyr's
## join-column guessing; `id` is the only shared column)
sfe_ca_hcl <- left_join(sfe_ca, data.frame(cluster = clusters, id = names(clusters)),
                        by = 'id')
## plot
ggplot(sfe_ca_hcl, aes(x=dim1, y=dim2, label=id)) +
  geom_point(aes(color=as.factor(cluster)), alpha = 0.2) +
  facet_grid(~as.factor(cluster))
## hierarchical clustering doesn't provide discrete clusters along
## the dimensions of the correspondence analysis
## subset documents about logic
logic <- dfm_subset(dfm_sfe, grepl('(?<=\\-)logic|logic(?=\\-)', docnames(dfm_sfe), perl = TRUE))
## compute cosine similarity between documents
logic_sim <- textstat_simil(logic, margin = 'document', method = 'cosine')
## all pairs with a cosine similarity > .4
as.data.frame(logic_sim) %>%
  filter(cosine > .4) %>%
  arrange(desc(cosine))

## Redo the cosine similarities for another subset of documents.
## subset documents about aesthetics
aesth <- dfm_subset(dfm_sfe, grepl('aesthetics', docnames(dfm_sfe), perl = TRUE))
## compute cosine similarity
## (stored under a new name so the dfm subset is not overwritten;
## consistent with the logic_sim / fem_sim naming used elsewhere)
aesth_sim <- textstat_simil(aesth, margin = 'document', method = 'cosine')
## all pairs with a cosine similarity > .2
as.data.frame(aesth_sim) %>%
  filter(cosine > .2) %>%
  arrange(desc(cosine))
## subset documents about feminism
fem <- dfm_subset(dfm_sfe, grepl('(?<=\\-)fem|fem.*(?=\\-)', docnames(dfm_sfe), perl = TRUE))
## compute cosine similarities for the features
## "empowerment", "embodiment", and "rape"
## (fixed: the original ran textstat_simil on the `logic` subset, but the
## feminism subset `fem` created above is clearly the intended input)
fem_sim <- textstat_simil(fem, fem[, c("empowerment", "embodiment", "rape")],
                          margin = 'feature', method = 'cosine')
## top 5 results per feature
as.data.frame(fem_sim) %>%
  group_by(feature2) %>%
  arrange(feature2, desc(cosine)) %>%
  slice_head(n=5)

## Redo the cosine similarities for a different set of features.
## cosine similarities for a different set of features
## (fixed: use the feminism subset `fem`, not the `logic` subset)
fem_sim <- textstat_simil(fem, fem[, c("feminism", "patriarchy")],
                          margin = 'feature', method = 'cosine')
## top 5 results per feature
as.data.frame(fem_sim) %>%
  group_by(feature2) %>%
  arrange(feature2, desc(cosine)) %>%
  slice_head(n=5)

## A work by Lucien Baumgartner
## lucien.baumgartner@philos.uzh.ch
## https://lucienbaumgartner.github.io/